Store Sales Prediction with Linear Regression

Import Libraries

In [1]:
# Basic Python libraries
import pandas as pd
import numpy as np

# Date handling
from datetime import datetime, timedelta

# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Statistics
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Machine learning libraries
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.inf, precision=3)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

Import Data from CSV Files

In [2]:
# Import the training data set
train = pd.read_csv("train.csv")

# Import the test data set
test = pd.read_csv("test.csv")

# Import the store data set
stores = pd.read_csv("stores.csv")

# Import the features data set
feature = pd.read_csv("features.csv")

Merge the data sets:

  - train + stores + features
  - test + stores + features
In [3]:
# For the train data set
train_bt = pd.merge(train, stores)
train = pd.merge(train_bt, feature)

# For the test data set
test_bt = pd.merge(test, stores)
test = pd.merge(test_bt, feature)
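
With no key specified, pd.merge joins on all column names the frames share (Store for the stores table; Store, Date and IsHoliday for the features table). If you prefer the keys spelled out, a minimal sketch equivalent to the cell above (an alternative, not a follow-up step) could look like this:

In [ ]:
# Sketch: explicit-key version of the merges above (do not run in addition to the cell above)
train = pd.merge(train, stores, on='Store', how='inner')
train = pd.merge(train, feature, on=['Store', 'Date', 'IsHoliday'], how='inner')

test = pd.merge(test, stores, on='Store', how='inner')
test = pd.merge(test, feature, on=['Store', 'Date', 'IsHoliday'], how='inner')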
In [4]:
train.head(2)
Out[4]:
Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2010-02-05 24924.50 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
1 1 2 2010-02-05 50605.27 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
In [5]:
test.head(2)
Out[5]:
Store Dept Date IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
1 1 2 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
In [6]:
print (train.info())
print ("*****************************************")
print (test.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 421570 entries, 0 to 421569
Data columns (total 16 columns):
Store           421570 non-null int64
Dept            421570 non-null int64
Date            421570 non-null object
Weekly_Sales    421570 non-null float64
IsHoliday       421570 non-null bool
Type            421570 non-null object
Size            421570 non-null int64
Temperature     421570 non-null float64
Fuel_Price      421570 non-null float64
MarkDown1       150681 non-null float64
MarkDown2       111248 non-null float64
MarkDown3       137091 non-null float64
MarkDown4       134967 non-null float64
MarkDown5       151432 non-null float64
CPI             421570 non-null float64
Unemployment    421570 non-null float64
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 51.9+ MB
None
*****************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 115064 entries, 0 to 115063
Data columns (total 15 columns):
Store           115064 non-null int64
Dept            115064 non-null int64
Date            115064 non-null object
IsHoliday       115064 non-null bool
Type            115064 non-null object
Size            115064 non-null int64
Temperature     115064 non-null float64
Fuel_Price      115064 non-null float64
MarkDown1       114915 non-null float64
MarkDown2       86437 non-null float64
MarkDown3       105235 non-null float64
MarkDown4       102176 non-null float64
MarkDown5       115064 non-null float64
CPI             76902 non-null float64
Unemployment    76902 non-null float64
dtypes: bool(1), float64(9), int64(3), object(2)
memory usage: 13.3+ MB
None

Select only positive weekly sales

In [7]:
# Keep only rows where weekly sales are positive.
train = train[train['Weekly_Sales']>0]

Data Description:

1. Training Data

In [8]:
numeric_var_train=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]
cat_var_train=[key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['object']]

# Train Numerical Data
train_num=train[numeric_var_train]

# Train Categorical Data
train_cat=train[cat_var_train]

print (numeric_var_train)
print (cat_var_train)
['Store', 'Dept', 'Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']
['Date', 'Type']
In [12]:
# Use a general function that returns multiple values
def var_summary(x):
    return pd.Series(
        [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(), x.std(), x.var(), x.min(),
         x.dropna().quantile(0.01), x.dropna().quantile(0.05), x.dropna().quantile(0.10),
         x.dropna().quantile(0.25), x.dropna().quantile(0.50), x.dropna().quantile(0.75),
         x.dropna().quantile(0.90), x.dropna().quantile(0.95), x.dropna().quantile(0.99), x.max()],
        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99', 'MAX'])
In [ ]:
num_summary=train_num.apply(lambda x: var_summary(x)).T
num_summary
In [9]:
def cat_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.value_counts()], 
                  index=['N', 'NMISS', 'ColumnsNames'])

cat_summary=train_cat.apply(lambda x: cat_summary(x))
cat_summary
Out[9]:
Date Type
N 420212 420212
NMISS 0 0
ColumnsNames 2011-12-23 3018 2011-11-25 3016 2011-12-... A 214961 B 162787 C 42464 Name: Type...

2. Testing Data

In [10]:
numeric_var_test=[key for key in dict(test.dtypes) if dict(test.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]
cat_var_test=[key for key in dict(test.dtypes) if dict(test.dtypes)[key] in ['object']]

# Test numerical data
test_num = test[numeric_var_test]

# Test categorical data
test_cat = test[cat_var_test]

print (numeric_var_test)
print (cat_var_test)
['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']
['Date', 'Type']
In [13]:
# Numerical data summary report
num_summary=test_num.apply(lambda x: var_summary(x)).T

num_summary.head()
Out[13]:
N NMISS SUM MEAN MEDIAN STD VAR MIN P1 P5 P10 P25 P50 P75 P90 P95 P99 MAX
Store 115064.0 0.0 2.558817e+06 22.238207 22.000 12.809930 1.640943e+02 1.000 1.000 3.000 5.000 11.000 22.000 33.000 40.000 43.000 45.000 45.000
Dept 115064.0 0.0 5.101883e+06 44.339524 37.000 30.656410 9.398155e+02 1.000 1.000 4.000 7.000 18.000 37.000 74.000 92.000 95.000 98.000 99.000
Size 115064.0 0.0 1.570597e+10 136497.688921 140167.000 61106.926438 3.734056e+09 34875.000 34875.000 39690.000 39910.000 93638.000 140167.000 202505.000 204184.000 206302.000 219622.000 219622.000
Temperature 115064.0 0.0 6.206760e+06 53.941804 54.470 18.724153 3.505939e+02 -7.290 11.440 23.980 29.970 39.820 54.470 67.350 79.480 83.820 92.140 101.950
Fuel_Price 115064.0 0.0 4.121070e+05 3.581546 3.606 0.239442 5.733244e-02 2.872 2.957 3.161 3.227 3.431 3.606 3.766 3.866 3.951 4.079 4.125
In [19]:
# categorical data summary report
def cat_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.value_counts()], 
                  index=['N', 'NMISS', 'ColumnsNames'])

cat_summary=test_cat.apply(lambda x: cat_summary(x))
cat_summary
Out[19]:
Date Type
N 115064 115064
NMISS 0 0
ColumnsNames 2012-12-21 3002 2012-12-07 2989 2012-12-... A 58713 B 44500 C 11851 Name: Type, d...
In [21]:
# Run pandas profiling to see the overall report
import pandas_profiling
pandas_profiling.ProfileReport(train)
Out[21]:

Overview

Dataset info

Number of variables 16
Number of observations 421570
Total Missing (%) 21.1%
Total size in memory 51.9 MiB
Average record size in memory 129.0 B

Variables types

Numeric 13
Categorical 2
Boolean 1
Date 0
Text (Unique) 0
Rejected 0
Unsupported 0

Warnings

  • Date has a high cardinality: 143 distinct values Warning
  • MarkDown1 has 270889 / 64.3% missing values Missing
  • MarkDown2 has 310322 / 73.6% missing values Missing
  • MarkDown3 has 284479 / 67.5% missing values Missing
  • MarkDown4 has 286603 / 68.0% missing values Missing
  • MarkDown5 has 270138 / 64.1% missing values Missing

Variables

CPI
Numeric

Distinct count 2145
Unique (%) 0.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 171.2
Minimum 126.06
Maximum 227.23
Zeros (%) 0.0%

Quantile statistics

Minimum 126.06
5-th percentile 126.5
Q1 132.02
Median 182.32
Q3 212.42
95-th percentile 221.94
Maximum 227.23
Range 101.17
Interquartile range 80.394

Descriptive statistics

Standard deviation 39.159
Coef of variation 0.22873
Kurtosis -1.8297
Mean 171.2
MAD 38.066
Skewness 0.085219
Sum 72174000
Variance 1533.4
Memory size 6.4 MiB
Value Count Frequency (%)  
129.8555333 711 0.2%
 
131.1083333 708 0.2%
 
129.84596670000002 707 0.2%
 
130.38490320000002 706 0.2%
 
130.683 706 0.2%
 
131.0756667 706 0.2%
 
130.6457931 706 0.2%
 
130.7196333 705 0.2%
 
130.4546207 705 0.2%
 
129.98454840000002 704 0.2%
 
Other values (2135) 414506 98.3%
 

Minimum 5 values

Value Count Frequency (%)  
126.064 678 0.2%
 
126.0766452 679 0.2%
 
126.08545159999998 675 0.2%
 
126.08929029999999 682 0.2%
 
126.1019355 686 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
227.01841659999997 69 0.0%
 
227.0369359 70 0.0%
 
227.16939190000002 63 0.0%
 
227.21428799999998 62 0.0%
 
227.2328068 63 0.0%
 

Date
Categorical

Distinct count 143
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
2011-12-23
 
3027
2011-11-25
 
3021
2011-12-16
 
3013
Other values (140)
412509
Value Count Frequency (%)  
2011-12-23 3027 0.7%
 
2011-11-25 3021 0.7%
 
2011-12-16 3013 0.7%
 
2011-12-09 3010 0.7%
 
2012-02-17 3007 0.7%
 
2011-12-30 3003 0.7%
 
2012-02-10 3001 0.7%
 
2011-12-02 2994 0.7%
 
2012-03-02 2990 0.7%
 
2012-10-12 2990 0.7%
 
Other values (133) 391514 92.9%
 

Dept
Numeric

Distinct count 81
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 44.26
Minimum 1
Maximum 99
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 4
Q1 18
Median 37
Q3 74
95-th percentile 95
Maximum 99
Range 98
Interquartile range 56

Descriptive statistics

Standard deviation 30.492
Coef of variation 0.68893
Kurtosis -1.2156
Mean 44.26
MAD 26.537
Skewness 0.35822
Sum 18658822
Variance 929.77
Memory size 6.4 MiB
Value Count Frequency (%)  
1 6435 1.5%
 
10 6435 1.5%
 
38 6435 1.5%
 
21 6435 1.5%
 
67 6435 1.5%
 
16 6435 1.5%
 
14 6435 1.5%
 
13 6435 1.5%
 
79 6435 1.5%
 
81 6435 1.5%
 
Other values (71) 357220 84.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 6435 1.5%
 
2 6435 1.5%
 
3 6435 1.5%
 
4 6435 1.5%
 
5 6347 1.5%
 

Maximum 5 values

Value Count Frequency (%)  
95 6435 1.5%
 
96 4854 1.2%
 
97 6278 1.5%
 
98 5836 1.4%
 
99 862 0.2%
 

Fuel_Price
Numeric

Distinct count 892
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.361
Minimum 2.472
Maximum 4.468
Zeros (%) 0.0%

Quantile statistics

Minimum 2.472
5-th percentile 2.653
Q1 2.933
Median 3.452
Q3 3.738
95-th percentile 4.029
Maximum 4.468
Range 1.996
Interquartile range 0.805

Descriptive statistics

Standard deviation 0.45851
Coef of variation 0.13642
Kurtosis -1.1854
Mean 3.361
MAD 0.4032
Skewness -0.1049
Sum 1416900
Variance 0.21024
Memory size 6.4 MiB
Value Count Frequency (%)  
3.638 2548 0.6%
 
3.63 2164 0.5%
 
2.7710000000000004 1917 0.5%
 
3.891 1856 0.4%
 
3.594 1796 0.4%
 
3.5239999999999996 1793 0.4%
 
3.523 1792 0.4%
 
2.72 1790 0.4%
 
3.6660000000000004 1778 0.4%
 
2.78 1656 0.4%
 
Other values (882) 402480 95.5%
 

Minimum 5 values

Value Count Frequency (%)  
2.472 38 0.0%
 
2.513 45 0.0%
 
2.5140000000000002 906 0.2%
 
2.52 39 0.0%
 
2.533 42 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.294 363 0.1%
 
4.301 360 0.1%
 
4.308 168 0.0%
 
4.449 358 0.1%
 
4.468 368 0.1%
 

IsHoliday
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.070358
True
 
29661
(Missing)
391909
Value Count Frequency (%)  
True 29661 7.0%
 
(Missing) 391909 93.0%
 

MarkDown1
Numeric

Distinct count 2278
Unique (%) 0.5%
Missing (%) 64.3%
Missing (n) 270889
Infinite (%) 0.0%
Infinite (n) 0
Mean 7246.4
Minimum 0.27
Maximum 88647
Zeros (%) 0.0%

Quantile statistics

Minimum 0.27
5-th percentile 149.19
Q1 2240.3
Median 5347.4
Q3 9210.9
95-th percentile 21801
Maximum 88647
Range 88646
Interquartile range 6970.6

Descriptive statistics

Standard deviation 8291.2
Coef of variation 1.1442
Kurtosis 17.606
Mean 7246.4
MAD 5262.8
Skewness 3.3418
Sum 1091900000
Variance 68744000
Memory size 6.4 MiB
Value Count Frequency (%)  
1.5 102 0.0%
 
460.73 102 0.0%
 
175.64 93 0.0%
 
1282.42 75 0.0%
 
9264.48 75 0.0%
 
686.24 75 0.0%
 
5924.71 75 0.0%
 
1483.17 75 0.0%
 
3242.59 74 0.0%
 
10671.71 74 0.0%
 
Other values (2267) 149861 35.5%
 
(Missing) 270889 64.3%
 

Minimum 5 values

Value Count Frequency (%)  
0.27 51 0.0%
 
0.5 49 0.0%
 
1.5 102 0.0%
 
1.94 50 0.0%
 
2.12 52 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
62567.6 66 0.0%
 
65021.23 73 0.0%
 
75149.79 73 0.0%
 
78124.5 70 0.0%
 
88646.76 68 0.0%
 

MarkDown2
Numeric

Distinct count 1500
Unique (%) 0.4%
Missing (%) 73.6%
Missing (n) 310322
Infinite (%) 0.0%
Infinite (n) 0
Mean 3334.6
Minimum -265.76
Maximum 104520
Zeros (%) 0.0%

Quantile statistics

Minimum -265.76
5-th percentile 1.95
Q1 41.6
Median 192
Q3 1926.9
95-th percentile 16497
Maximum 104520
Range 104790
Interquartile range 1885.3

Descriptive statistics

Standard deviation 9475.4
Coef of variation 2.8415
Kurtosis 37.59
Mean 3334.6
MAD 4690.4
Skewness 5.4413
Sum 370970000
Variance 89782000
Memory size 6.4 MiB
Value Count Frequency (%)  
1.91 539 0.1%
 
3.0 493 0.1%
 
0.5 485 0.1%
 
1.5 471 0.1%
 
4.0 367 0.1%
 
6.0 365 0.1%
 
7.64 354 0.1%
 
3.82 353 0.1%
 
5.73 345 0.1%
 
19.0 345 0.1%
 
Other values (1489) 107131 25.4%
 
(Missing) 310322 73.6%
 

Minimum 5 values

Value Count Frequency (%)  
-265.76 71 0.0%
 
-192.0 72 0.0%
 
-20.0 72 0.0%
 
-10.98 60 0.0%
 
-10.5 143 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
82881.16 73 0.0%
 
89121.94 74 0.0%
 
92523.94 73 0.0%
 
97740.99 73 0.0%
 
104519.54 72 0.0%
 

MarkDown3
Numeric

Distinct count 1663
Unique (%) 0.4%
Missing (%) 67.5%
Missing (n) 284479
Infinite (%) 0.0%
Infinite (n) 0
Mean 1439.4
Minimum -29.1
Maximum 141630
Zeros (%) 0.0%

Quantile statistics

Minimum -29.1
5-th percentile 0.65
Q1 5.08
Median 24.6
Q3 103.99
95-th percentile 1059.9
Maximum 141630
Range 141660
Interquartile range 98.91

Descriptive statistics

Standard deviation 9623.1
Coef of variation 6.6854
Kurtosis 77.688
Mean 1439.4
MAD 2578.1
Skewness 8.3995
Sum 197330000
Variance 92604000
Memory size 6.4 MiB
Value Count Frequency (%)  
3.0 754 0.2%
 
6.0 710 0.2%
 
2.0 660 0.2%
 
1.0 611 0.1%
 
0.22 487 0.1%
 
0.5 463 0.1%
 
0.01 444 0.1%
 
4.0 439 0.1%
 
3.2 379 0.1%
 
1.98 363 0.1%
 
Other values (1652) 131781 31.3%
 
(Missing) 284479 67.5%
 

Minimum 5 values

Value Count Frequency (%)  
-29.1 72 0.0%
 
-1.0 70 0.0%
 
-0.87 46 0.0%
 
-0.2 69 0.0%
 
0.0 67 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
89402.64 71 0.0%
 
101378.79 73 0.0%
 
103991.94 72 0.0%
 
109030.75 75 0.0%
 
141630.61 74 0.0%
 

MarkDown4
Numeric

Distinct count 1945
Unique (%) 0.5%
Missing (%) 68.0%
Missing (n) 286603
Infinite (%) 0.0%
Infinite (n) 0
Mean 3383.2
Minimum 0.22
Maximum 67475
Zeros (%) 0.0%

Quantile statistics

Minimum 0.22
5-th percentile 28.76
Q1 504.22
Median 1481.3
Q3 3595
95-th percentile 12646
Maximum 67475
Range 67475
Interquartile range 3090.8

Descriptive statistics

Standard deviation 6292.4
Coef of variation 1.8599
Kurtosis 29.997
Mean 3383.2
MAD 3329.7
Skewness 4.8475
Sum 456620000
Variance 39594000
Memory size 6.4 MiB
Value Count Frequency (%)  
9.0 280 0.1%
 
4.0 200 0.0%
 
2.0 197 0.0%
 
3.0 146 0.0%
 
47.0 143 0.0%
 
67.72 142 0.0%
 
17.0 141 0.0%
 
657.56 141 0.0%
 
8.0 140 0.0%
 
1330.36 140 0.0%
 
Other values (1934) 133297 31.6%
 
(Missing) 286603 68.0%
 

Minimum 5 values

Value Count Frequency (%)  
0.22 57 0.0%
 
0.41 52 0.0%
 
0.46 48 0.0%
 
0.78 52 0.0%
 
0.87 49 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
52739.02 72 0.0%
 
53603.99 72 0.0%
 
57815.43 68 0.0%
 
57817.56 74 0.0%
 
67474.85 72 0.0%
 

MarkDown5
Numeric

Distinct count 2294
Unique (%) 0.5%
Missing (%) 64.1%
Missing (n) 270138
Infinite (%) 0.0%
Infinite (n) 0
Mean 4629
Minimum 135.16
Maximum 108520
Zeros (%) 0.0%

Quantile statistics

Minimum 135.16
5-th percentile 715.52
Q1 1878.4
Median 3359.4
Q3 5563.8
95-th percentile 11269
Maximum 108520
Range 108380
Interquartile range 3685.4

Descriptive statistics

Standard deviation 5962.9
Coef of variation 1.2882
Kurtosis 107.85
Mean 4629
MAD 2989.8
Skewness 8.1699
Sum 700970000
Variance 35556000
Memory size 6.4 MiB
Value Count Frequency (%)  
2743.18 136 0.0%
 
1064.56 120 0.0%
 
9083.54 75 0.0%
 
20371.02 75 0.0%
 
3567.03 75 0.0%
 
4180.29 75 0.0%
 
3557.67 75 0.0%
 
986.23 74 0.0%
 
1773.53 74 0.0%
 
14660.97 74 0.0%
 
Other values (2283) 150579 35.7%
 
(Missing) 270138 64.1%
 

Minimum 5 values

Value Count Frequency (%)  
135.16 65 0.0%
 
153.04 47 0.0%
 
153.9 49 0.0%
 
164.08 52 0.0%
 
170.64 69 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
58068.14 69 0.0%
 
63005.58 69 0.0%
 
85851.87 68 0.0%
 
105223.11 70 0.0%
 
108519.28 68 0.0%
 

Size
Numeric

Distinct count 40
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 136730
Minimum 34875
Maximum 219622
Zeros (%) 0.0%

Quantile statistics

Minimum 34875
5-th percentile 39690
Q1 93638
Median 140170
Q3 202500
95-th percentile 206300
Maximum 219622
Range 184747
Interquartile range 108870

Descriptive statistics

Standard deviation 60981
Coef of variation 0.446
Kurtosis -1.2063
Mean 136730
MAD 52517
Skewness -0.32585
Sum 57640387438
Variance 3718600000
Memory size 6.4 MiB
Value Count Frequency (%)  
39690 20802 4.9%
 
39910 20597 4.9%
 
203819 20376 4.8%
 
219622 10474 2.5%
 
126512 10315 2.4%
 
205863 10272 2.4%
 
151315 10244 2.4%
 
202307 10238 2.4%
 
204184 10225 2.4%
 
158114 10224 2.4%
 
Other values (30) 287803 68.3%
 

Minimum 5 values

Value Count Frequency (%)  
34875 8999 2.1%
 
37392 9036 2.1%
 
39690 20802 4.9%
 
39910 20597 4.9%
 
41062 6751 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
204184 10225 2.4%
 
205863 10272 2.4%
 
206302 10113 2.4%
 
207499 10062 2.4%
 
219622 10474 2.5%
 

Store
Numeric

Distinct count 45
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 22.201
Minimum 1
Maximum 45
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 3
Q1 11
Median 22
Q3 33
95-th percentile 43
Maximum 45
Range 44
Interquartile range 22

Descriptive statistics

Standard deviation 12.785
Coef of variation 0.5759
Kurtosis -1.1465
Mean 22.201
MAD 10.996
Skewness 0.077763
Sum 9359084
Variance 163.46
Memory size 6.4 MiB
Value Count Frequency (%)  
13 10474 2.5%
 
10 10315 2.4%
 
4 10272 2.4%
 
1 10244 2.4%
 
2 10238 2.4%
 
24 10228 2.4%
 
27 10225 2.4%
 
34 10224 2.4%
 
20 10214 2.4%
 
6 10211 2.4%
 
Other values (35) 318925 75.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 10244 2.4%
 
2 10238 2.4%
 
3 9036 2.1%
 
4 10272 2.4%
 
5 8999 2.1%
 

Maximum 5 values

Value Count Frequency (%)  
41 10088 2.4%
 
42 6953 1.6%
 
43 6751 1.6%
 
44 7169 1.7%
 
45 9637 2.3%
 

Temperature
Numeric

Distinct count 3528
Unique (%) 0.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 60.09
Minimum -2.06
Maximum 100.14
Zeros (%) 0.0%

Quantile statistics

Minimum -2.06
5-th percentile 27.31
Q1 46.68
Median 62.09
Q3 74.28
95-th percentile 87.27
Maximum 100.14
Range 102.2
Interquartile range 27.6

Descriptive statistics

Standard deviation 18.448
Coef of variation 0.307
Kurtosis -0.63592
Mean 60.09
MAD 15.377
Skewness -0.3214
Sum 25332000
Variance 340.33
Memory size 6.4 MiB
Value Count Frequency (%)  
50.43 709 0.2%
 
67.87 646 0.2%
 
72.62 594 0.1%
 
76.67 583 0.1%
 
70.28 563 0.1%
 
76.03 555 0.1%
 
50.56 544 0.1%
 
64.05 542 0.1%
 
64.21 519 0.1%
 
50.81 487 0.1%
 
Other values (3518) 415828 98.6%
 

Minimum 5 values

Value Count Frequency (%)  
-2.06 69 0.0%
 
5.54 68 0.0%
 
6.23 69 0.0%
 
7.46 69 0.0%
 
9.51 70 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
99.2 46 0.0%
 
99.22 185 0.0%
 
99.66 48 0.0%
 
100.07 46 0.0%
 
100.14 44 0.0%
 

Type
Categorical

Distinct count 3
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
A
215478
B
163495
C
 
42597
Value Count Frequency (%)  
A 215478 51.1%
 
B 163495 38.8%
 
C 42597 10.1%
 

Unemployment
Numeric

Distinct count 349
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.9603
Minimum 3.879
Maximum 14.313
Zeros (%) 0.0%

Quantile statistics

Minimum 3.879
5-th percentile 5.326
Q1 6.891
Median 7.866
Q3 8.572
95-th percentile 12.187
Maximum 14.313
Range 10.434
Interquartile range 1.681

Descriptive statistics

Standard deviation 1.8633
Coef of variation 0.23407
Kurtosis 2.7312
Mean 7.9603
MAD 1.283
Skewness 1.1837
Sum 3355800
Variance 3.4719
Memory size 6.4 MiB
Value Count Frequency (%)  
8.099 5152 1.2%
 
8.163 3636 0.9%
 
7.852 3614 0.9%
 
7.343 3416 0.8%
 
7.057 3414 0.8%
 
7.931 3400 0.8%
 
7.441 3397 0.8%
 
6.565 3370 0.8%
 
8.2 3361 0.8%
 
6.891 3360 0.8%
 
Other values (339) 385450 91.4%
 

Minimum 5 values

Value Count Frequency (%)  
3.8789999999999996 287 0.1%
 
4.077 938 0.2%
 
4.125 1831 0.4%
 
4.145 562 0.1%
 
4.156000000000001 1815 0.4%
 

Maximum 5 values

Value Count Frequency (%)  
13.975 1529 0.4%
 
14.020999999999999 2263 0.5%
 
14.099 2441 0.6%
 
14.18 2423 0.6%
 
14.312999999999999 2636 0.6%
 

Weekly_Sales
Numeric

Distinct count 359464
Unique (%) 85.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15981
Minimum -4988.9
Maximum 693100
Zeros (%) 0.0%

Quantile statistics

Minimum -4988.9
5-th percentile 59.975
Q1 2079.7
Median 7612
Q3 20206
95-th percentile 61202
Maximum 693100
Range 698090
Interquartile range 18126

Descriptive statistics

Standard deviation 22711
Coef of variation 1.4211
Kurtosis 21.491
Mean 15981
MAD 15161
Skewness 3.262
Sum 6737200000
Variance 515800000
Memory size 6.4 MiB
Value Count Frequency (%)  
10.0 353 0.1%
 
5.0 289 0.1%
 
20.0 232 0.1%
 
15.0 215 0.1%
 
12.0 175 0.0%
 
1.0 169 0.0%
 
10.47 167 0.0%
 
11.97 154 0.0%
 
2.0 148 0.0%
 
7.0 146 0.0%
 
Other values (359454) 419522 99.5%
 

Minimum 5 values

Value Count Frequency (%)  
-4988.94 1 0.0%
 
-3924.0 1 0.0%
 
-1750.0 1 0.0%
 
-1699.0 1 0.0%
 
-1321.48 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
474330.1 1 0.0%
 
627962.93 1 0.0%
 
630999.19 1 0.0%
 
649770.18 1 0.0%
 
693099.36 1 0.0%
 

Correlations

Sample

Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2010-02-05 24924.50 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
1 1 2 2010-02-05 50605.27 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
2 1 3 2010-02-05 13740.12 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
3 1 4 2010-02-05 39954.04 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
4 1 5 2010-02-05 32229.38 False A 151315 42.31 2.572 NaN NaN NaN NaN NaN 211.096358 8.106
In [22]:
pandas_profiling.ProfileReport(test)
Out[22]:

Overview

Dataset info

Number of variables 15
Number of observations 115064
Total Missing (%) 7.4%
Total size in memory 13.3 MiB
Average record size in memory 121.0 B

Variables types

Numeric 12
Categorical 2
Boolean 1
Date 0
Text (Unique) 0
Rejected 0
Unsupported 0

Warnings

  • CPI has 38162 / 33.2% missing values Missing
  • MarkDown2 has 28627 / 24.9% missing values Missing
  • MarkDown3 has 9829 / 8.5% missing values Missing
  • MarkDown4 has 12888 / 11.2% missing values Missing
  • MarkDown5 is highly skewed (γ1 = 37.977) Skewed
  • Unemployment has 38162 / 33.2% missing values Missing

Variables

CPI
Numeric

Distinct count 361
Unique (%) 0.3%
Missing (%) 33.2%
Missing (n) 38162
Infinite (%) 0.0%
Infinite (n) 0
Mean 176.96
Minimum 131.24
Maximum 228.98
Zeros (%) 0.0%

Quantile statistics

Minimum 131.24
5-th percentile 131.48
Q1 138.4
Median 192.3
Q3 223.24
95-th percentile 227.78
Maximum 228.98
Range 97.74
Interquartile range 84.842

Descriptive statistics

Standard deviation 41.24
Coef of variation 0.23305
Kurtosis -1.8588
Mean 176.96
MAD 40.222
Skewness 0.071448
Sum 13609000
Variance 1700.7
Memory size 1.8 MiB
Value Count Frequency (%)  
132.71609679999997 2080 1.8%
 
139.1226129 1664 1.4%
 
201.0705712 825 0.7%
 
224.80253140000002 783 0.7%
 
131.537 704 0.6%
 
132.2725714 703 0.6%
 
131.2793548 702 0.6%
 
131.642 702 0.6%
 
131.4784 701 0.6%
 
132.65377420000002 698 0.6%
 
Other values (350) 67340 58.5%
 
(Missing) 38162 33.2%
 

Minimum 5 values

Value Count Frequency (%)  
131.2362258 695 0.6%
 
131.2793548 702 0.6%
 
131.3258 696 0.6%
 
131.37666670000002 695 0.6%
 
131.4275333 693 0.6%
 

Maximum 5 values

Value Count Frequency (%)  
228.72986380000003 401 0.3%
 
228.7796682 208 0.2%
 
228.8020401 60 0.1%
 
228.8892482 60 0.1%
 
228.9764563 186 0.2%
 

Date
Categorical

Distinct count 39
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
2012-12-21
 
3002
2012-12-07
 
2989
2012-12-28
 
2988
Other values (36)
106085
Value Count Frequency (%)  
2012-12-21 3002 2.6%
 
2012-12-07 2989 2.6%
 
2012-12-28 2988 2.6%
 
2012-12-14 2986 2.6%
 
2013-02-15 2984 2.6%
 
2012-11-23 2976 2.6%
 
2012-11-09 2971 2.6%
 
2013-01-04 2964 2.6%
 
2013-02-08 2964 2.6%
 
2012-11-30 2962 2.6%
 
Other values (29) 85278 74.1%
 

Dept
Numeric

Distinct count 81
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 44.34
Minimum 1
Maximum 99
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 4
Q1 18
Median 37
Q3 74
95-th percentile 95
Maximum 99
Range 98
Interquartile range 56

Descriptive statistics

Standard deviation 30.656
Coef of variation 0.6914
Kurtosis -1.2242
Mean 44.34
MAD 26.74
Skewness 0.36242
Sum 5101883
Variance 939.82
Memory size 1.8 MiB
Value Count Frequency (%)  
1 1755 1.5%
 
13 1755 1.5%
 
91 1755 1.5%
 
90 1755 1.5%
 
21 1755 1.5%
 
38 1755 1.5%
 
82 1755 1.5%
 
40 1755 1.5%
 
81 1755 1.5%
 
16 1755 1.5%
 
Other values (71) 97514 84.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 1755 1.5%
 
2 1755 1.5%
 
3 1755 1.5%
 
4 1755 1.5%
 
5 1738 1.5%
 

Maximum 5 values

Value Count Frequency (%)  
95 1755 1.5%
 
96 1350 1.2%
 
97 1716 1.5%
 
98 1632 1.4%
 
99 613 0.5%
 

Fuel_Price
Numeric

Distinct count 297
Unique (%) 0.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.5815
Minimum 2.872
Maximum 4.125
Zeros (%) 0.0%

Quantile statistics

Minimum 2.872
5-th percentile 3.161
Q1 3.431
Median 3.606
Q3 3.766
95-th percentile 3.951
Maximum 4.125
Range 1.253
Interquartile range 0.335

Descriptive statistics

Standard deviation 0.23944
Coef of variation 0.066854
Kurtosis -0.1176
Mean 3.5815
MAD 0.18861
Skewness -0.39128
Sum 412110
Variance 0.057332
Memory size 1.8 MiB
Value Count Frequency (%)  
3.417 1853 1.6%
 
3.583 1851 1.6%
 
3.386 1793 1.6%
 
3.611 1374 1.2%
 
3.108 1201 1.0%
 
3.4789999999999996 1169 1.0%
 
3.597 1071 0.9%
 
3.451 1043 0.9%
 
3.227 1040 0.9%
 
3.614 1028 0.9%
 
Other values (287) 101641 88.3%
 

Minimum 5 values

Value Count Frequency (%)  
2.872 276 0.2%
 
2.889 276 0.2%
 
2.9139999999999997 193 0.2%
 
2.927 194 0.2%
 
2.957 279 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
4.079 282 0.2%
 
4.099 355 0.3%
 
4.104 186 0.2%
 
4.109 189 0.2%
 
4.125 166 0.1%
 

IsHoliday
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.077592
True
 
8928
(Missing)
106136
Value Count Frequency (%)  
True 8928 7.8%
 
(Missing) 106136 92.2%
 

MarkDown1
Numeric

Distinct count 1753
Unique (%) 1.5%
Missing (%) 0.1%
Missing (n) 149
Infinite (%) 0.0%
Infinite (n) 0
Mean 7689.2
Minimum -2781.4
Maximum 103180
Zeros (%) 0.0%

Quantile statistics

Minimum -2781.4
5-th percentile 189.49
Q1 1966.5
Median 4842.3
Q3 9439.1
95-th percentile 23141
Maximum 103180
Range 105970
Interquartile range 7472.7

Descriptive statistics

Standard deviation 10699
Coef of variation 1.3914
Kurtosis 22.871
Mean 7689.2
MAD 6160.2
Skewness 4.1727
Sum 883610000
Variance 114460000
Memory size 1.8 MiB
Value Count Frequency (%)  
4655.55 74 0.1%
 
13357.31 74 0.1%
 
22673.11 74 0.1%
 
13613.52 74 0.1%
 
5692.66 74 0.1%
 
10755.57 74 0.1%
 
9753.88 74 0.1%
 
20297.6 74 0.1%
 
5813.45 73 0.1%
 
7701.72 73 0.1%
 
Other values (1742) 114177 99.2%
 
(Missing) 149 0.1%
 

Minimum 5 values

Value Count Frequency (%)  
-2781.45 50 0.0%
 
-772.21 43 0.0%
 
-563.9 70 0.1%
 
-16.93 44 0.0%
 
2.14 46 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
80498.65 71 0.1%
 
84139.36 72 0.1%
 
88750.34 66 0.1%
 
95102.5 71 0.1%
 
103184.98 72 0.1%
 

MarkDown2
Numeric

Distinct count 1258
Unique (%) 1.1%
Missing (%) 24.9%
Missing (n) 28627
Infinite (%) 0.0%
Infinite (n) 0
Mean 3734.1
Minimum -35.74
Maximum 71074
Zeros (%) 0.0%

Quantile statistics

Minimum -35.74
5-th percentile 6.14
Q1 180.35
Median 742.59
Q3 2735.7
95-th percentile 22672
Maximum 71074
Range 71110
Interquartile range 2555.3

Descriptive statistics

Standard deviation 8323.5
Coef of variation 2.2291
Kurtosis 15.881
Mean 3734.1
MAD 4697.7
Skewness 3.7406
Sum 322760000
Variance 69281000
Memory size 1.8 MiB
Value Count Frequency (%)  
0.01 346 0.3%
 
0.03 340 0.3%
 
82.92 217 0.2%
 
11.0 214 0.2%
 
3.0 209 0.2%
 
4.0 191 0.2%
 
104.92 141 0.1%
 
1.49 138 0.1%
 
0.06 138 0.1%
 
7.5 137 0.1%
 
Other values (1247) 84366 73.3%
 
(Missing) 28627 24.9%
 

Minimum 5 values

Value Count Frequency (%)  
-35.74 63 0.1%
 
-15.45 71 0.1%
 
-7.76 65 0.1%
 
-3.27 69 0.1%
 
-0.05 73 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
52304.87 73 0.1%
 
52850.71 74 0.1%
 
56549.69 73 0.1%
 
59362.3 72 0.1%
 
71074.17 72 0.1%
 

MarkDown3
Numeric

Distinct count 1422
Unique (%) 1.2%
Missing (%) 8.5%
Missing (n) 9829
Infinite (%) 0.0%
Infinite (n) 0
Mean 2403.1
Minimum -179.26
Maximum 149480
Zeros (%) 0.0%

Quantile statistics

Minimum -179.26
5-th percentile 1.18
Q1 15.1
Median 78.26
Q3 272.58
95-th percentile 2361.6
Maximum 149480
Range 149660
Interquartile range 257.48

Descriptive statistics

Standard deviation 13768
Coef of variation 5.7293
Kurtosis 54.091
Mean 2403.1
MAD 4226.8
Skewness 7.1461
Sum 252890000
Variance 189560000
Memory size 1.8 MiB
Value Count Frequency (%)  
1.2 599 0.5%
 
1.0 498 0.4%
 
0.6 419 0.4%
 
0.8 348 0.3%
 
2.0 324 0.3%
 
0.4 278 0.2%
 
0.2 272 0.2%
 
5.0 271 0.2%
 
3.0 271 0.2%
 
0.1 269 0.2%
 
Other values (1411) 101686 88.4%
 
(Missing) 9829 8.5%
 

Minimum 5 values

Value Count Frequency (%)  
-179.26 62 0.1%
 
-89.1 66 0.1%
 
-44.54 67 0.1%
 
-23.97 72 0.1%
 
-17.44 69 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
115048.81 73 0.1%
 
130129.11 70 0.1%
 
139621.51 72 0.1%
 
146394.44 72 0.1%
 
149483.31 73 0.1%
 

MarkDown4
Numeric

Distinct count 1484
Unique (%) 1.3%
Missing (%) 11.2%
Missing (n) 12888
Infinite (%) 0.0%
Infinite (n) 0
Mean 3356.2
Minimum 0.22
Maximum 65345
Zeros (%) 0.0%

Quantile statistics

Minimum 0.22
5-th percentile 16.96
Q1 155.46
Median 840.94
Q3 3096.9
95-th percentile 14191
Maximum 65345
Range 65344
Interquartile range 2941.5

Descriptive statistics

Standard deviation 7570.5
Coef of variation 2.2557
Kurtosis 25.452
Mean 3356.2
MAD 3897.5
Skewness 4.6686
Sum 342930000
Variance 57312000
Memory size 1.8 MiB
Value Count Frequency (%)  
3.0 171 0.1%
 
0.63 154 0.1%
 
358.15 145 0.1%
 
55.46 142 0.1%
 
2.61 141 0.1%
 
3.97 138 0.1%
 
4.88 137 0.1%
 
27.44 136 0.1%
 
970.77 134 0.1%
 
1.92 120 0.1%
 
Other values (1473) 100758 87.6%
 
(Missing) 12888 11.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.22 56 0.0%
 
0.63 154 0.1%
 
0.66 46 0.0%
 
0.78 54 0.0%
 
1.26 43 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
56735.25 72 0.1%
 
60065.82 72 0.1%
 
63130.81 70 0.1%
 
63830.91 71 0.1%
 
65344.64 72 0.1%
 

MarkDown5
Numeric

Distinct count 1754
Unique (%) 1.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3922.7
Minimum -185.17
Maximum 771450
Zeros (%) 0.0%

Quantile statistics

Minimum -185.17
5-th percentile 540.89
Q1 1309.3
Median 2390.4
Q3 4227.3
95-th percentile 9316.7
Maximum 771450
Range 771630
Interquartile range 2918

Descriptive statistics

Standard deviation 19445
Coef of variation 4.9571
Kurtosis 1494.9
Mean 3922.7
MAD 2983.7
Skewness 37.977
Sum 451360000
Variance 378110000
Memory size 1.8 MiB
Value Count Frequency (%)  
3113.78 137 0.1%
 
7968.28 74 0.1%
 
2105.14 74 0.1%
 
18831.34 74 0.1%
 
22677.91 74 0.1%
 
2167.73 74 0.1%
 
1947.25 74 0.1%
 
21807.99 74 0.1%
 
5449.98 74 0.1%
 
860.36 73 0.1%
 
Other values (1744) 114262 99.3%
 

Minimum 5 values

Value Count Frequency (%)  
-185.17 63 0.1%
 
-37.02 73 0.1%
 
40.98 44 0.0%
 
60.92 65 0.1%
 
114.25 51 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
35238.98 72 0.1%
 
43336.34 70 0.1%
 
45050.55 70 0.1%
 
45648.88 69 0.1%
 
771448.1 71 0.1%
 

Size
Numeric

Distinct count 40
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 136500
Minimum 34875
Maximum 219622
Zeros (%) 0.0%

Quantile statistics

Minimum 34875
5-th percentile 39690
Q1 93638
Median 140170
Q3 202500
95-th percentile 206300
Maximum 219622
Range 184747
Interquartile range 108870

Descriptive statistics

Standard deviation 61107
Coef of variation 0.44768
Kurtosis -1.2144
Mean 136500
MAD 52641
Skewness -0.32195
Sum 15705970078
Variance 3734100000
Memory size 1.8 MiB
Value Count Frequency (%)  
39910 5803 5.0%
 
39690 5702 5.0%
 
203819 5589 4.9%
 
219622 2836 2.5%
 
205863 2803 2.4%
 
202307 2797 2.4%
 
204184 2791 2.4%
 
202505 2788 2.4%
 
151315 2783 2.4%
 
126512 2782 2.4%
 
Other values (30) 78390 68.1%
 

Minimum 5 values

Value Count Frequency (%)  
34875 2447 2.1%
 
37392 2473 2.1%
 
39690 5702 5.0%
 
39910 5803 5.0%
 
41062 1863 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
204184 2791 2.4%
 
205863 2803 2.4%
 
206302 2745 2.4%
 
207499 2756 2.4%
 
219622 2836 2.5%
 

Store
Numeric

Distinct count 45
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 22.238
Minimum 1
Maximum 45
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 3
Q1 11
Median 22
Q3 33
95-th percentile 43
Maximum 45
Range 44
Interquartile range 22

Descriptive statistics

Standard deviation 12.81
Coef of variation 0.57603
Kurtosis -1.1498
Mean 22.238
MAD 11.02
Skewness 0.076773
Sum 2558817
Variance 164.09
Memory size 1.8 MiB
Value Count Frequency (%)  
13 2836 2.5%
 
4 2803 2.4%
 
19 2799 2.4%
 
2 2797 2.4%
 
27 2791 2.4%
 
24 2790 2.4%
 
6 2788 2.4%
 
1 2783 2.4%
 
10 2782 2.4%
 
20 2774 2.4%
 
Other values (35) 87121 75.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 2783 2.4%
 
2 2797 2.4%
 
3 2473 2.1%
 
4 2803 2.4%
 
5 2447 2.1%
 

Maximum 5 values

Value Count Frequency (%)  
41 2754 2.4%
 
42 1962 1.7%
 
43 1863 1.6%
 
44 2072 1.8%
 
45 2626 2.3%
 

Temperature
Numeric

Distinct count 1236
Unique (%) 1.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 53.942
Minimum -7.29
Maximum 101.95
Zeros (%) 0.0%

Quantile statistics

Minimum -7.29
5-th percentile 23.98
Q1 39.82
Median 54.47
Q3 67.35
95-th percentile 83.82
Maximum 101.95
Range 109.24
Interquartile range 27.53

Descriptive statistics

Standard deviation 18.724
Coef of variation 0.34712
Kurtosis -0.49597
Mean 53.942
MAD 15.417
Skewness -0.07357
Sum 6206800
Variance 350.59
Memory size 1.8 MiB
Value Count Frequency (%)  
57.25 312 0.3%
 
70.74 309 0.3%
 
70.18 309 0.3%
 
38.95 272 0.2%
 
70.01 263 0.2%
 
57.87 262 0.2%
 
85.0 261 0.2%
 
52.38 260 0.2%
 
79.15 260 0.2%
 
58.66 259 0.2%
 
Other values (1226) 112297 97.6%
 

Minimum 5 values

Value Count Frequency (%)  
-7.29 69 0.1%
 
-6.61 69 0.1%
 
-6.08 70 0.1%
 
0.25 68 0.1%
 
2.32 71 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
94.1 45 0.0%
 
95.1 45 0.0%
 
95.51 45 0.0%
 
99.66 48 0.0%
 
101.95 187 0.2%
 

Type
Categorical

Distinct count 3
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
A
58713
B
44500
C
 
11851
Value Count Frequency (%)  
A 58713 51.0%
 
B 44500 38.7%
 
C 11851 10.3%
 

Unemployment
Numeric

Distinct count 90
Unique (%) 0.1%
Missing (%) 33.2%
Missing (n) 38162
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.8687
Minimum 3.684
Maximum 10.199
Zeros (%) 0.0%

Quantile statistics

Minimum 3.684
5-th percentile 3.932
Q1 5.771
Median 6.806
Q3 8.036
95-th percentile 9.91
Maximum 10.199
Range 6.515
Interquartile range 2.265

Descriptive statistics

Standard deviation 1.5834
Coef of variation 0.23053
Kurtosis -0.60933
Mean 6.8687
MAD 1.3101
Skewness 0.1414
Sum 528220
Variance 2.5072
Memory size 1.8 MiB
Value Count Frequency (%)  
6.237 3377 2.9%
 
9.91 2454 2.1%
 
6.17 2336 2.0%
 
6.266 2147 1.9%
 
5.372000000000001 1871 1.6%
 
8.036 1823 1.6%
 
7.107 1808 1.6%
 
3.932 1808 1.6%
 
7.439 1805 1.6%
 
8.625 1783 1.5%
 
Other values (79) 55690 48.4%
 
(Missing) 38162 33.2%
 

Minimum 5 values

Value Count Frequency (%)  
3.6839999999999997 556 0.5%
 
3.8789999999999996 650 0.6%
 
3.8960000000000004 288 0.3%
 
3.9210000000000003 932 0.8%
 
3.932 1808 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
8.951 847 0.7%
 
9.151 588 0.5%
 
9.874 751 0.7%
 
9.91 2454 2.1%
 
10.199 1731 1.5%
 

Correlations

Sample

Store Dept Date IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
0 1 1 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
1 1 2 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
2 1 3 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
3 1 4 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573
4 1 5 2012-11-02 False A 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 2737.42 223.462779 6.573

Correlation matrix

In [14]:
# Correlation for train data
train_corr=pd.DataFrame(train.corr())
train_corr.head()
Out[14]:
Store Dept Weekly_Sales IsHoliday Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
Store 1.000000 0.024258 -0.085117 -0.000522 -0.182763 -0.050230 0.065321 -0.119676 -0.034993 -0.031475 -0.009991 -0.026777 -0.211261 0.208759
Dept 0.024258 1.000000 0.148749 0.000663 -0.002491 0.004727 0.003544 -0.002512 0.000018 0.001855 0.004176 0.000295 -0.007178 0.007787
Weekly_Sales -0.085117 0.148749 1.000000 0.012843 0.244117 -0.002339 0.000089 0.085325 0.024565 0.060304 0.045325 0.090561 -0.021162 -0.025806
IsHoliday -0.000522 0.000663 0.012843 1.000000 0.000797 -0.155775 -0.078155 -0.035632 0.334327 0.428364 -0.000459 -0.053696 -0.001933 0.010555
Size -0.182763 -0.002491 0.244117 0.000797 1.000000 -0.058413 0.003632 0.345732 0.108843 0.048935 0.168266 0.304814 -0.003903 -0.068335
In [15]:
# Correlation for test data
test_corr=pd.DataFrame(test.corr())
test_corr.head()
Out[15]:
Store Dept IsHoliday Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment
Store 1.000000 0.019627 -0.001166 -0.186845 -0.043495 0.153425 -0.091707 -0.041370 -0.025177 0.010331 0.010419 -0.214872 0.250321
Dept 0.019627 1.000000 0.001249 0.001502 0.003970 0.000554 -0.002353 0.001292 0.000247 0.002510 0.000776 -0.006336 0.004087
IsHoliday -0.001166 0.001249 1.000000 -0.000443 -0.187428 -0.126443 0.355257 0.265402 0.496062 0.289700 -0.019386 -0.001475 0.010288
Size -0.186845 0.001502 -0.000443 1.000000 -0.061256 0.055088 0.309614 0.157526 0.050088 0.155448 0.103681 -0.002916 -0.001988
Temperature -0.043495 0.003970 -0.187428 -0.061256 1.000000 0.073938 -0.168899 -0.324280 -0.049771 -0.059583 0.003937 0.280861 0.022136
In [16]:
# visualize correlation matrix in Seaborn using a heatmap
sns.heatmap(train.corr())
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0xdb38b38>
In [17]:
# visualize correlation matrix in Seaborn using a heatmap
sns.heatmap(test.corr())
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1870d4e0>

Exploratory Data Analysis:

In [18]:
# Share of records per store
train['Store'].value_counts(normalize=True).plot(kind='bar', figsize=(4, 5))
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x19c1e400>
In [19]:
# weekly sales plot
sns.distplot(train.Weekly_Sales)
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x19c1e1d0>
In [20]:
# Store wise sales
train.plot(kind='line', x='Weekly_Sales', y='Store', alpha=0.5)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x573d198>

Sales Vs Type:

In [21]:
# Weekly sales Type wise
sns.barplot(x=train["Weekly_Sales"],y=train["Type"])
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x1249be80>

Department-wise Sales:

In [22]:
train.plot(kind='line', x='Dept', y='Weekly_Sales', alpha=0.5, figsize=(4, 5))
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x175ffd68>

Missing value Treatment

In [23]:
print (train.isnull().sum())
print ("*"*30)
print (test.isnull().sum())
Store                0
Dept                 0
Date                 0
Weekly_Sales         0
IsHoliday            0
Type                 0
Size                 0
Temperature          0
Fuel_Price           0
MarkDown1       270031
MarkDown2       309308
MarkDown3       283561
MarkDown4       285694
MarkDown5       269283
CPI                  0
Unemployment         0
dtype: int64
******************************
Store               0
Dept                0
Date                0
IsHoliday           0
Type                0
Size                0
Temperature         0
Fuel_Price          0
MarkDown1         149
MarkDown2       28627
MarkDown3        9829
MarkDown4       12888
MarkDown5           0
CPI             38162
Unemployment    38162
dtype: int64

Imputing CPI and Unemployment with the department-wise mean

In [24]:
test['CPI']=test.groupby(['Dept'])['CPI'].transform(lambda x: x.fillna(x.mean()))
test['Unemployment']=test.groupby(['Dept'])['Unemployment'].transform(lambda x: x.fillna(x.mean()))
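
A minimal safeguard, assuming a department could have no observed CPI or Unemployment at all (the group mean above would stay NaN in that case): fall back to the overall column mean. This fallback is an added sketch, not a step from the original run.

In [ ]:
# Sketch: fill any group that was entirely missing with the overall column mean
test['CPI'] = test['CPI'].fillna(test['CPI'].mean())
test['Unemployment'] = test['Unemployment'].fillna(test['Unemployment'].mean())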

For the remaining missing values (the MarkDown columns), impute with zero (no markdown)

In [25]:
train=train.fillna(0)
test=test.fillna(0)
In [26]:
# Recheck the missing values.

print (train.isnull().sum())
print ("*"*30)
print (test.isnull().sum())
Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday       0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
dtype: int64
******************************
Store           0
Dept            0
Date            0
IsHoliday       0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
dtype: int64

Outlier Treatment

In [27]:
# Cap extreme weekly sales at 100,000
train.Weekly_Sales = np.where(train.Weekly_Sales > 100000, 100000, train.Weekly_Sales)
In [28]:
train.Weekly_Sales.plot.hist(bins=25)
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x176834e0>
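
If a data-driven threshold is preferred over the fixed 100,000 ceiling, a possible sketch caps at a high quantile instead (the 99th percentile here is an assumed choice, not a value from the analysis above):

In [ ]:
# Sketch: quantile-based capping as an alternative to the fixed 100,000 ceiling
cap = train['Weekly_Sales'].quantile(0.99)   # assumed percentile choice
train['Weekly_Sales'] = np.where(train['Weekly_Sales'] > cap, cap, train['Weekly_Sales'])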

Feature Extraction

In this section, we select the appropriate features to train our regressor. We create new features from existing ones and convert categorical features into numeric form.
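
As an aside, the cells below map Type and IsHoliday to integer codes by hand; an equivalent route via one-hot encoding is sketched here (an alternative, not what this notebook actually uses; the Type_ prefix is an assumed naming choice):

In [ ]:
# Sketch: one-hot encode Type instead of the manual A/B/C -> 1/2/3 mapping used below
train_ohe = pd.get_dummies(train, columns=['Type'], prefix='Type')
test_ohe = pd.get_dummies(test, columns=['Type'], prefix='Type')
print(train_ohe.filter(like='Type_').head())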

In [29]:
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 420212 entries, 0 to 421569
Data columns (total 16 columns):
Store           420212 non-null int64
Dept            420212 non-null int64
Date            420212 non-null object
Weekly_Sales    420212 non-null float64
IsHoliday       420212 non-null bool
Type            420212 non-null object
Size            420212 non-null int64
Temperature     420212 non-null float64
Fuel_Price      420212 non-null float64
MarkDown1       420212 non-null float64
MarkDown2       420212 non-null float64
MarkDown3       420212 non-null float64
MarkDown4       420212 non-null float64
MarkDown5       420212 non-null float64
CPI             420212 non-null float64
Unemployment    420212 non-null float64
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 71.7+ MB

Date Feature

In [30]:
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
In [31]:
# Extract date features
train['Date_dayofweek'] =train['Date'].dt.dayofweek
train['Date_month'] =train['Date'].dt.month 
train['Date_year'] =train['Date'].dt.year
train['Date_day'] =train['Date'].dt.day 

# For test data
test['Date_dayofweek'] =test['Date'].dt.dayofweek
test['Date_month'] =test['Date'].dt.month 
test['Date_year'] =test['Date'].dt.year
test['Date_day'] =test['Date'].dt.day

Type Feature Details

In [32]:
print (train.Type.value_counts())
print ("*"*30)
print (test.Type.value_counts())
A    214961
B    162787
C     42464
Name: Type, dtype: int64
******************************
A    58713
B    44500
C    11851
Name: Type, dtype: int64

IsHoliday Feature Details

In [33]:
print (train.IsHoliday.value_counts())
print ("*"*30)
print (test.IsHoliday.value_counts())
False    390652
True      29560
Name: IsHoliday, dtype: int64
******************************
False    106136
True       8928
Name: IsHoliday, dtype: int64
In [34]:
# Put train and test in a list so both can be transformed in one loop
train_test_data = [train, test]

Converting the categorical variable 'Type' into a numerical variable: A=1, B=2, C=3

In [35]:
type_mapping = {"A": 1, "B": 2, "C": 3}
for dataset in train_test_data:
    dataset['Type'] = dataset['Type'].map(type_mapping)

Converting Categorical Variable 'IsHoliday' into Numerical Variable

In [36]:
type_mapping = {False: 0, True: 1}
for dataset in train_test_data:
    dataset['IsHoliday'] = dataset['IsHoliday'].map(type_mapping)

Creating extra holiday variables: the flag is 1 (yes) if the week falls on that holiday, otherwise 0 (no).

Making new holiday variables based on the holiday dates given with the data....

In [37]:
# For Train Data Set
train['Super_Bowl'] = np.where((train['Date']==datetime(2010, 2, 12)) | (train['Date']==datetime(2011, 2, 11)) | (train['Date']==datetime(2012, 2, 10)) | (train['Date']==datetime(2013, 2, 8)),1,0)
train['Labour_Day'] = np.where((train['Date']==datetime(2010, 9, 10)) | (train['Date']==datetime(2011, 9, 9)) | (train['Date']==datetime(2012, 9, 7)) | (train['Date']==datetime(2013, 9, 6)),1,0)
train['Thanksgiving'] = np.where((train['Date']==datetime(2010, 11, 26)) | (train['Date']==datetime(2011, 11, 25)) | (train['Date']==datetime(2012, 11, 23)) | (train['Date']==datetime(2013, 11, 29)),1,0)
train['Christmas'] = np.where((train['Date']==datetime(2010, 12, 31)) | (train['Date']==datetime(2011, 12, 30)) | (train['Date']==datetime(2012, 12, 28)) | (train['Date']==datetime(2013, 12, 27)),1,0)

#For Test Data set........................................................................
test['Super_Bowl'] = np.where((test['Date']==datetime(2010, 2, 12)) | (test['Date']==datetime(2011, 2, 11)) | (test['Date']==datetime(2012, 2, 10)) | (test['Date']==datetime(2013, 2, 8)),1,0)
test['Labour_Day'] = np.where((test['Date']==datetime(2010, 9, 10)) | (test['Date']==datetime(2011, 9, 9)) | (test['Date']==datetime(2012, 9, 7)) | (test['Date']==datetime(2013, 9, 6)),1,0)
test['Thanksgiving'] = np.where((test['Date']==datetime(2010, 11, 26)) | (test['Date']==datetime(2011, 11, 25)) | (test['Date']==datetime(2012, 11, 23)) | (test['Date']==datetime(2013, 11, 29)),1,0)
test['Christmas'] = np.where((test['Date']==datetime(2010, 12, 31)) | (test['Date']==datetime(2011, 12, 30)) | (test['Date']==datetime(2012, 12, 28)) | (test['Date']==datetime(2013, 12, 27)),1,0)
In [38]:
# Update IsHoliday to include these new holiday flags
train['IsHoliday']=train['IsHoliday']|train['Super_Bowl']|train['Labour_Day']|train['Thanksgiving']|train['Christmas']
test['IsHoliday']=test['IsHoliday']|test['Super_Bowl']|test['Labour_Day']|test['Thanksgiving']|test['Christmas']
In [39]:
# Count of holiday for train data
print (train.Christmas.value_counts())
print (train.Super_Bowl.value_counts())
print (train.Thanksgiving.value_counts())
print (train.Labour_Day.value_counts())
0    414303
1      5909
Name: Christmas, dtype: int64
0    411339
1      8873
Name: Super_Bowl, dtype: int64
0    414266
1      5946
Name: Thanksgiving, dtype: int64
0    411380
1      8832
Name: Labour_Day, dtype: int64
In [40]:
# Count of holiday for Test data
print (test.Christmas.value_counts())
print (test.Super_Bowl.value_counts())
print (test.Thanksgiving.value_counts())
print (test.Labour_Day.value_counts())
0    112076
1      2988
Name: Christmas, dtype: int64
0    112100
1      2964
Name: Super_Bowl, dtype: int64
0    112088
1      2976
Name: Thanksgiving, dtype: int64
0    115064
Name: Labour_Day, dtype: int64
In [41]:
# IsHoliday now incorporates the extra holidays, so the individual flags are redundant.
# Drop the extra holiday variables.
dp=['Super_Bowl','Labour_Day','Thanksgiving','Christmas']

train.drop(dp,axis=1,inplace=True)
test.drop(dp,axis=1,inplace=True)
In [42]:
train.head(2)
Out[42]:
Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment Date_dayofweek Date_month Date_year Date_day
0 1 1 2010-02-05 24924.50 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 4 2 2010 5
1 1 2 2010-02-05 50605.27 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 4 2 2010 5

Feature Selection

Dropping irrelevant variables (Unemployment, CPI and MarkDown5):

  -Since the MarkDown variables have already been imputed, most of them are kept.
  -MarkDown5 is removed because it is highly skewed.
In [43]:
features_drop=['Unemployment','CPI','MarkDown5']
train=train.drop(features_drop, axis=1)
test=test.drop(features_drop, axis=1)
In [44]:
train.head(2)
Out[44]:
Store Dept Date Weekly_Sales IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 Date_dayofweek Date_month Date_year Date_day
0 1 1 2010-02-05 24924.50 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 4 2 2010 5
1 1 2 2010-02-05 50605.27 0 1 151315 42.31 2.572 0.0 0.0 0.0 0.0 4 2 2010 5
In [45]:
test.head(2)
Out[45]:
Store Dept Date IsHoliday Type Size Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 Date_dayofweek Date_month Date_year Date_day
0 1 1 2012-11-02 0 1 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 4 11 2012 2
1 1 2 2012-11-02 0 1 151315 55.32 3.386 6766.44 5147.7 50.82 3639.9 4 11 2012 2
In [46]:
# Convert all float columns to integer (this truncates decimal values)
for var in train:
    if train[var].dtypes == float:
        train[var]=train[var].astype(int)
        
for var in test:
    if test[var].dtypes == float:
        test[var]=test[var].astype(int)

First, check whether the target Y (Weekly_Sales) is normally distributed

In [47]:
import seaborn as sns
sns.distplot(train.Weekly_Sales)
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0xc2e0f98>

As the figure above shows, Y is not normally distributed, so we take the log of Y.

In [48]:
train['Weekly_Sales']=np.log(train['Weekly_Sales']+1)
In [49]:
sns.distplot(train.Weekly_Sales)
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0xc3a8588>
In [ ]:
## TODO: try a Box-Cox transformation and re-plot (a sketch follows the probability plot below).
In [50]:
# Check normality of the transformed target with a Q-Q plot
from scipy import stats
import pylab

stats.probplot(train.Weekly_Sales, dist="norm", plot=pylab )
pylab.show()
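
A minimal Box-Cox sketch for the to-do above, assuming we recover the raw positive sales by inverting the log1p applied earlier (the +1 shift guards against zeros introduced by the integer cast):

In [ ]:
# Sketch: Box-Cox transform of the recovered weekly sales
raw_sales = np.expm1(train['Weekly_Sales'])      # undo the log1p applied above
bc_sales, lam = stats.boxcox(raw_sales + 1)      # lambda fitted by maximum likelihood
print('fitted lambda:', lam)

stats.probplot(bc_sales, dist="norm", plot=pylab)
pylab.show()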

Model Building & Accuracy

Define training and testing set

In [51]:
#### train_X = everything except Weekly_Sales and Date
train_X=train.drop(['Weekly_Sales','Date'], axis=1)

#### train Y= Only Weekly_Sales 
train_y=train['Weekly_Sales'] 

#### Test_X
test_X=test.drop('Date',axis=1).copy()

train_X.shape, train_y.shape, test_X.shape
Out[51]:
((420212, 15), (420212,), (115064, 15))

Building models & comparing their scores

1. Linear Regression

In [52]:
# Method 1: scikit-learn LinearRegression
clf = LinearRegression()
clf.fit(train_X, train_y)
y_pred_linear = clf.predict(test_X)
acc_linear = round(clf.score(train_X, train_y) * 100, 2)
print('score: ' + str(acc_linear) + ' percent')
score: 11.03 percent
In [59]:
import statsmodels.api as sm
In [60]:
# Method 2: statsmodels OLS
train_x = sm.add_constant(train_X)       # design matrix with an intercept column
lm = sm.OLS(train_y, train_X).fit()      # fitted on train_X, i.e. without the intercept column
In [61]:
print(lm.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           Weekly_Sales   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.110
Method:                 Least Squares   F-statistic:                     3720.
Date:                Tue, 03 Sep 2019   Prob (F-statistic):               0.00
Time:                        21:56:39   Log-Likelihood:            -8.6782e+05
No. Observations:              420212   AIC:                         1.736e+06
Df Residuals:                  420197   BIC:                         1.736e+06
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Store             -0.0134      0.000    -56.412      0.000      -0.014      -0.013
Dept               0.0016   9.65e-05     16.264      0.000       0.001       0.002
IsHoliday         -0.0520      0.012     -4.169      0.000      -0.076      -0.028
Type               0.1135      0.008     14.836      0.000       0.099       0.129
Size            1.084e-05   8.38e-08    129.421      0.000    1.07e-05     1.1e-05
Temperature       -0.0036      0.000    -21.059      0.000      -0.004      -0.003
Fuel_Price         0.0329      0.008      4.130      0.000       0.017       0.049
MarkDown1       1.071e-05   1.02e-06     10.539      0.000    8.72e-06    1.27e-05
MarkDown2      -8.538e-07   6.17e-07     -1.384      0.166   -2.06e-06    3.56e-07
MarkDown3       3.695e-06   5.59e-07      6.613      0.000     2.6e-06    4.79e-06
MarkDown4      -6.834e-06   1.43e-06     -4.794      0.000   -9.63e-06   -4.04e-06
Date_dayofweek    35.7844      3.199     11.187      0.000      29.515      42.054
Date_month         0.0160      0.001     16.088      0.000       0.014       0.018
Date_year         -0.0676      0.006    -10.621      0.000      -0.080      -0.055
Date_day          -0.0004      0.000     -1.049      0.294      -0.001       0.000
==============================================================================
Omnibus:                    79726.360   Durbin-Watson:                   1.429
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           156534.468
Skew:                          -1.156   Prob(JB):                         0.00
Kurtosis:                       4.896   Cond. No.                     1.63e+08
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.63e+08. This might indicate that there are
strong multicollinearity or other numerical problems.

2. Random Forest

In [53]:
clf = RandomForestRegressor(n_estimators=100)
clf.fit(train_X, train_y)
y_pred_rf=clf.predict(test_X)
acc_rf= round(clf.score(train_X, train_y) * 100, 2)
print ("Accuracy: %i %% \n"%acc_rf)
Accuracy: 99 % 

3. Decision Tree

In [54]:
clf=DecisionTreeRegressor()
clf.fit(train_X, train_y)
y_pred_dt= clf.predict(test_X)
acc_dt = round( clf.score(train_X, train_y) * 100, 2)
print (str(acc_dt) + ' percent')
100.0 percent

Comparing Models

Let's compare the training R² scores of the regression models used above. These are in-sample scores, so the tree-based models look near-perfect largely because they can memorize the training data; an out-of-sample RMSE comparison is sketched after the table.

In [55]:
models = pd.DataFrame({
    'Model': ['Linear Regression','Random Forest','Decision Tree'],
    
    'Score': [acc_linear, acc_rf,acc_dt]
    })

models.sort_values(by='Score', ascending=False)
Out[55]:
Model Score
2 Decision Tree 100.00
1 Random Forest 99.63
0 Linear Regression 11.03
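
The table above uses in-sample scores. A hedged sketch of an out-of-sample comparison with RMSE (on the log-sales scale), using the train_test_split and mean_squared_error already imported; the 80/20 split and random_state are assumed choices:

In [ ]:
# Sketch: compare models on a hold-out split with RMSE (log-sales scale)
X_tr, X_val, y_tr, y_val = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

for name, model in [('Linear Regression', LinearRegression()),
                    ('Decision Tree', DecisionTreeRegressor()),
                    ('Random Forest', RandomForestRegressor(n_estimators=100))]:
    model.fit(X_tr, y_tr)
    rmse = np.sqrt(mse(y_val, model.predict(X_val)))
    print('%s validation RMSE: %.4f' % (name, rmse))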

Predicting the sales value for the test data with the Random Forest model.

In [57]:
# Prediction value using Random Forest model..
submission = pd.DataFrame({
        "Store_Dept_Date": test.Store.astype(str)+'_'+test.Dept.astype(str)+'_'+test.Date.astype(str),
        "Weekly_Sales": y_pred_rf
    })

submission.to_csv('weekly_sales predicted.csv', index=False)
#submission.to_excel(writer,'Weekly_sales Pred',index=False)
In [58]:
submission.head()
Out[58]:
Store_Dept_Date Weekly_Sales
0 1_1_2012-11-02 10.268372
1 1_2_2012-11-02 10.766197
2 1_3_2012-11-02 9.306946
3 1_4_2012-11-02 10.562144
4 1_5_2012-11-02 10.341259
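
Note that the models were trained on log(Weekly_Sales + 1), so the predicted values above are on the log scale. If dollar-scale sales are wanted in the file, a sketch of the inverse transform (np.expm1 undoes the log1p; the output file name is an assumed choice):

In [ ]:
# Sketch: convert log-scale predictions back to the original sales scale
submission_dollars = submission.copy()
submission_dollars['Weekly_Sales'] = np.expm1(submission_dollars['Weekly_Sales'])
submission_dollars.to_csv('weekly_sales_predicted_dollars.csv', index=False)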
In [ ]:
##########################End##########

Happy Learning....